import numpy as np
import pandas as pd
import re
# sklearn
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global matplotlib defaults for every figure in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was deprecated/removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-whitegrid') — confirm the pinned
# matplotlib version before upgrading.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic (valid only inside a notebook): render figures inline.
%matplotlib inline
import warnings
# Suppress all warnings for cleaner notebook output (hides deprecations too).
warnings.filterwarnings("ignore")
In this article, we analyze a weather dataset from Kaggle.com.
Data description from Kaggle:
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text highlighted on a colored background, padded with '=' to width L.

    Parameters:
        Text (str): the header text to display.
        L (int): total width of the printed line (text + filler), default 100.
        C (str): background color of the text and color of the '=' filler.
        T (str): foreground (text) color.
    """
    # 'White' added for consistency with FORE (and with Line's palette);
    # the original raised KeyError for C='White'.
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
            'Yellow': Back.YELLOW, 'Blue': Back.BLUE, 'Magenta': Back.MAGENTA,
            'Cyan': Back.CYAN, 'White': Back.WHITE}
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE, 'Magenta': Fore.MAGENTA,
            'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    # Text on a colored background, one space, then '=' filler up to width L.
    print(BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL + ' ' + FORE[C] +
          Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C."""
    palette = {name: getattr(Fore, name.upper())
               for name in ('Black', 'Red', 'Green', 'Yellow',
                            'Blue', 'Magenta', 'Cyan', 'White')}
    rule = '=' * L
    print(palette[C] + Style.NORMAL + rule + Style.RESET_ALL)
def Search_List(Key, List):
    """Return every element of List that contains Key as a substring."""
    matches = list(filter(lambda item: Key in item, List))
    return matches
# Load the daily weather data and drop the row-number column.
Path = 'weatherdata/daily_weather.csv'
Data = pd.read_csv(Path)
Data.drop(columns = ['number'], inplace = True)
# Human-readable column names: '..._9am' -> morning (suffix dropped),
# '..._3pm' -> '(Afternoon)', underscores -> spaces, Title Case,
# 'Temp' expanded to 'Temperature'.
Data.columns = [x.replace('ty_9am', 'ty_(Morning)').replace('3pm', '(Afternoon)')
                 .replace('_9am', '').replace('_', ' ').title()
                 .replace('Temp', 'Temperature') for x in Data.columns.tolist()]
Header('Dataset:')
# Styler.hide_index() / set_precision() were removed in pandas 2.0;
# hide(axis='index') / format(precision=...) are the supported forms (pandas >= 1.4).
display(Data.head(10).style.hide(axis='index').format(precision=2))
Dataset_features = pd.read_csv(Path.split("/")[0] + '/dataset_features.csv', sep = ';')
Header('Feature Description:')
display(Dataset_features.style.hide(axis='index'))
Dataset: ===========================================================================================
| Air Pressure | Air Temperature | Avg Wind Direction | Avg Wind Speed | Max Wind Direction | Max Wind Speed | Rain Accumulation | Rain Duration | Relative Humidity (Morning) | Relative Humidity (Afternoon) |
|---|---|---|---|---|---|---|---|---|---|
| 918.06 | 74.82 | 271.10 | 2.08 | 295.40 | 2.86 | 0.00 | 0.00 | 42.42 | 36.16 |
| 917.35 | 71.40 | 101.94 | 2.44 | 140.47 | 3.53 | 0.00 | 0.00 | 24.33 | 19.43 |
| 923.04 | 60.64 | 51.00 | 17.07 | 63.70 | 22.10 | 0.00 | 20.00 | 8.90 | 14.46 |
| 920.50 | 70.14 | 198.83 | 4.34 | 211.20 | 5.19 | 0.00 | 0.00 | 12.19 | 12.74 |
| 921.16 | 44.29 | 277.80 | 1.86 | 136.50 | 2.86 | 8.90 | 14730.00 | 92.41 | 76.74 |
| 915.30 | 78.40 | 182.80 | 9.93 | 189.00 | 10.98 | 0.02 | 170.00 | 35.13 | 33.93 |
| 915.60 | 70.04 | 177.88 | 3.75 | 186.61 | 4.59 | 0.00 | 0.00 | 10.66 | 21.39 |
| 918.07 | 51.71 | 242.40 | 2.53 | 271.60 | 3.65 | 0.00 | 0.00 | 80.47 | 74.92 |
| 920.08 | 80.58 | 40.70 | 4.52 | 63.00 | 5.88 | 0.00 | 0.00 | 29.58 | 24.03 |
| 915.01 | 47.50 | 163.10 | 4.94 | 195.90 | 6.58 | 0.00 | 0.00 | 88.60 | 68.05 |
Feature Description: ===============================================================================
| Feature | Description |
|---|---|
| Air Pressure | Air pressure in hectopascals (100 pascals) at 9 AM |
| Air Temperature | Air temperature in degrees Fahrenheit at 9 AM |
| Avg Wind Direction | Average wind direction over the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Avg Wind Speed | Average wind speed over the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Max Wind Direction | Highest wind direction in the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Max Wind Speed | Highest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Min Wind Speed | Smallest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Rain Accumulation | Accumulated rain in millimeters (mm) at 9 AM |
| Rain Duration | Length of time it rained, in seconds (s), at 9 AM |
| Relative Humidity (Morning) | Relative humidity as a percentage at 9 AM |
| Relative Humidity (Afternoon) | Relative humidity in percentage at 3 PM |
Imputing Missing Values:
def Data_Plot(Inp, Title = None, W = None):
    """Plot, per feature, the percentage of non-null values as a bar chart.

    Parameters:
        Inp (pd.DataFrame): data to summarize.
        Title (str, optional): bold, centered chart title.
        W (int, optional): chart width in pixels.

    Returns:
        pd.DataFrame with columns 'Features', 'Data Type',
        'Number of NaN Values', 'Size', 'Percentage' (non-null share).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Share of non-null values, rounded to 2 decimal places.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                      bordercolor="DarkGray", borderwidth=1))
    if W is not None:
        fig.update_layout(width = W)
    # '%{text}' is plotly's placeholder for the bar's text value; the original
    # '%%{text}' produced a stray literal '%' before each number.
    fig.update_traces(texttemplate= 10*' ' + '%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if Title is not None:
        fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
                                 'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
# Visualize missingness, mean-impute every column that has NaNs, then
# re-plot to confirm all features are 100% non-null.
data_info = Data_Plot(Data, Title = 'Daily Weather Dataset', W = 800)
# Columns with at least one NaN, per the summary table built by Data_Plot.
Temp = data_info.loc[data_info['Number of NaN Values']>0, 'Features'].tolist()
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Replace NaNs in-place with each column's mean.
Data[Temp] = imp.fit_transform(Data[Temp])
del imp, Temp
# Second plot should now show no missing values anywhere.
_ = Data_Plot(Data, Title = 'Daily Weather Dataset', W = 800)
Let's set Relative Humidity (Afternoon) as the target variable. This means that, given the dataset and using the rest of the features, we would like to know whether it is humid or not at 3 PM. In doing so, we can consider the median of Relative Humidity (Afternoon). Then, assign 1 to values greater than or equal to the median value, and 0 to values under the median value.
# Binarize the target: 1 when afternoon humidity is at or above the median,
# 0 otherwise. The prose above specifies "over or equal the median"; the
# original used a strict '>', which mislabeled values exactly at the median.
Median = Data['Relative Humidity (Afternoon)'].median()
Temp = Data['Relative Humidity (Afternoon)'] >= Median
Temp = Temp.astype(int)
Target = 'Relative Humidity (Afternoon)'
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
def Feature_Normalize(Inp, Target, PD):
    """Standardize every feature except Target to zero mean / unit variance.

    Plots two heatmaps of per-feature variances (raw vs. standardized) and
    returns a copy of Inp with the feature columns standardized; the Target
    column is left untouched.

    Parameters:
        Inp (pd.DataFrame): input data.
        Target (str): column name to exclude from scaling.
        PD (dict): plot settings with keys 'figsize', 'hspace', 'annot_text_size'.

    Returns:
        pd.DataFrame: copy of Inp with standardized feature columns.
    """
    X = Inp.drop(columns = [Target])
    # scaling data
    scaler = preprocessing.StandardScaler()
    # Keep X's index so the assignment into Out below aligns by label; the
    # original dropped the index, which silently misaligns rows whenever
    # Inp does not have a default RangeIndex.
    X_std = pd.DataFrame(data = scaler.fit_transform(X), columns = X.columns, index = X.index)
    fig, ax = plt.subplots(2, 1, figsize = PD['figsize'])
    ax = ax.ravel()
    CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", X.shape[1])]
    Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
    Sets = [X, X_std]
    kws = dict(label='Feature\nVariance', aspect=10, shrink= .3)
    for i in range(len(ax)):
        # One-row frame of per-feature variances, sorted descending.
        Temp = Sets[i].var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
        _ = sns.heatmap(Temp, ax=ax[i], annot=True, square=True, cmap = CP[i],
                        linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0],
                        annot_kws={"size": PD['annot_text_size']}, cbar_kws=kws)
        _ = ax[i].set_yticklabels('')
        _ = ax[i].set_title(Names[i], weight='bold', fontsize = 14)
        del Temp
    plt.subplots_adjust(hspace=PD['hspace'])
    Out = Inp.copy()
    Out[X.columns] = X_std.copy()
    return Out
# Standardize the features, plot the variance comparison, and persist the
# result alongside the original file (…/daily_weather_STD.csv).
PD = {'figsize': (12, 8), 'hspace': 0.2, 'annot_text_size': 12}
df = Feature_Normalize(Data, Target, PD)
out_path = Path.split(".")[0] + '_STD.csv'
df.to_csv(out_path, index=None, header=True)
def Correlation_Plot (Inp, Fig_Size):
    """Heatmap of the lower triangle (plus diagonal) of Inp's correlation matrix.

    Parameters:
        Inp (pd.DataFrame): data whose pairwise correlations are plotted.
        Fig_Size (float): figure width and height in inches (square figure).
    """
    Correlation_Matrix = Inp.corr().round(2)
    # Boolean mask hiding the strict upper triangle; diagonal stays visible.
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    np.fill_diagonal(mask, False)
    fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # vmin=-1 so negative correlations are not clipped: the correlation range
    # is [-1, 1], and the original vmin=0 rendered every r <= 0 identically.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("Greens", n_colors=10), linewidths=0.2,
                vmin=-1, vmax=1, cbar_kws={"shrink": .6})
Correlation_Plot (df, 8)